import torch
import torch_npu

# Target device: the first NPU visible to the process.
device = "npu:0"

# Each matrix is 1k x 1k float32: 1M elements, 4 MB per matrix.
# A, B and C together take 12 MB, which is smaller than the 192 MB cache of the A3.
matrix_size = 1024
A = torch.randn(matrix_size, matrix_size, dtype=torch.float32).to(device)
B = torch.randn(matrix_size, matrix_size, dtype=torch.float32).to(device)

print("start")
# Repeatedly multiply the same two operands; C is overwritten on every pass,
# so only the most recent product is kept.
for _ in range(50000):
    C = torch.matmul(A, B)
# Device ops are queued asynchronously with respect to the host, so the loop
# above can finish issuing work long before the NPU has executed it. Block
# here so that "done" really means every matmul has completed.
torch.npu.synchronize()
print("done")